library(lubridate)
library(ggplot2)
library(here)

# All raw Google Trends exports below are read relative to this directory.
setwd(here("release_data","RawGtrends"))

#### READ IN DATA ####
# Topic prefixes used throughout: c = crime, w = welfare, r = report,
# z = weather.

# *_3mo.csv files: used later in the script as the daily series.
c_dy <- read.csv("crime_3mo.csv")
w_dy <- read.csv("welfare_3mo.csv")
r_dy <- read.csv("report_3mo.csv")
z_dy <- read.csv("weather_3mo.csv")

# *_monthly.csv files: the monthly series.
c_mo <- read.csv("crime_monthly.csv")
w_mo <- read.csv("welfare_monthly.csv")
r_mo <- read.csv("report_monthly.csv")
z_mo <- read.csv("weather_monthly.csv")

# The weekly series is split across several files per topic; collect every
# file in the directory whose name contains "weekly" and split by topic.
wk_list <- grep("weekly", list.files(), value = TRUE)

c_list <- grep("crime", wk_list, value = TRUE)
w_list <- grep("welfare", wk_list, value = TRUE)
r_list <- grep("report", wk_list, value = TRUE)
z_list <- grep("weather", wk_list, value = TRUE)

# Read every weekly file of one topic and stack the results.
#
# files: character vector of CSV file names for a single topic.
#
# Each file's rows are tagged with `pull`, the position of the file within
# `files`. We're dropping the last row of every file because it's the endpoint
# of the pull and only contains partial data.
#
# Returns one data frame with all pulls stacked in file order, or NULL when
# `files` is empty.
read_weekly_pulls <- function(files) {
  pulls <- lapply(seq_along(files), function(i) {
    pull_df <- read.csv(files[i], stringsAsFactors = FALSE)
    pull_df$pull <- i
    pull_df[-nrow(pull_df), ]
  })
  do.call(rbind, pulls)
}

# Each topic is read over its own file list, so a topic with more (or fewer)
# weekly files than "crime" is still read completely.
c_wk <- read_weekly_pulls(c_list)
w_wk <- read_weekly_pulls(w_list)
r_wk <- read_weekly_pulls(r_list)
z_wk <- read_weekly_pulls(z_list)

#### CLEAN DATA ####
# For every topic: give the weekly, daily and monthly tables the same
# date/search/partial column names, drop weekly rows whose date was already
# seen (keeping the first occurrence), and parse the weekly and daily date
# columns as Date.

# crime
names(c_wk)[1:3] <- c("date","search","partial")
names(c_dy)[2:4] <- c("date","search","partial")
names(c_mo) <- c("date","search","partial")
c_wk <- c_wk[!duplicated(c_wk[["date"]]), ]
c_wk[["date"]] <- as.Date(c_wk[["date"]])
c_dy[["date"]] <- as.Date(c_dy[["date"]])

# welfare
names(w_wk)[1:3] <- c("date","search","partial")
names(w_dy)[2:4] <- c("date","search","partial")
names(w_mo) <- c("date","search","partial")
w_wk <- w_wk[!duplicated(w_wk[["date"]]), ]
w_wk[["date"]] <- as.Date(w_wk[["date"]])
w_dy[["date"]] <- as.Date(w_dy[["date"]])

# report
names(r_wk)[1:3] <- c("date","search","partial")
names(r_dy)[2:4] <- c("date","search","partial")
names(r_mo) <- c("date","search","partial")
r_wk <- r_wk[!duplicated(r_wk[["date"]]), ]
r_wk[["date"]] <- as.Date(r_wk[["date"]])
r_dy[["date"]] <- as.Date(r_dy[["date"]])

# weather
names(z_wk)[1:3] <- c("date","search","partial")
names(z_dy)[2:4] <- c("date","search","partial")
names(z_mo) <- c("date","search","partial")
z_wk <- z_wk[!duplicated(z_wk[["date"]]), ]
z_wk[["date"]] <- as.Date(z_wk[["date"]])
z_dy[["date"]] <- as.Date(z_dy[["date"]])

#### WEEK TO MONTH ####
# Rescale each weekly series against the monthly series of the same topic:
# for every month, wk_adj = monthly search value / mean of the weekly search
# values in that month, and search_adj = weekly search value * wk_adj.

# Month-and-year key (e.g. "1 2017") used to join weekly and monthly rows.
moyr_key <- function(dates) {
  paste(month(dates), year(dates))
}

# Mean weekly `search` value for every (month key, pull) combination.
# Returns a data frame with columns moyr, pull, search.
monthly_mean_of_weeks <- function(wk) {
  agg <- aggregate(wk$search, list(wk$moyr, wk$pull), mean)
  colnames(agg) <- c("moyr","pull","search")
  agg
}

# Full outer join of the monthly series (search.x) with the monthly means of
# the weekly series (search.y), plus their ratio wk_adj.
month_adjustment <- function(mo, agg) {
  mowk <- merge(mo, agg, by="moyr", all=TRUE)
  # rarely, the same date is pulled in multiple pulls - get rid of dupes
  mowk <- mowk[!duplicated(mowk$date),]
  mowk$wk_adj <- mowk$search.x/mowk$search.y
  mowk
}

# Attach wk_adj to every weekly row of the matching month and compute the
# rescaled value search_adj.
apply_month_adjustment <- function(wk, adj) {
  wk2 <- merge(wk, adj, by="moyr")
  wk2$search_adj <- wk2$search*wk2$wk_adj
  wk2
}

#crime
c_wk$moyr <- moyr_key(c_wk$date)
agg_c <- monthly_mean_of_weeks(c_wk)
c_mo$date <- as.Date(c_mo$date)
c_mo$moyr <- moyr_key(c_mo$date)
c_mowk <- month_adjustment(c_mo, agg_c)
cw_adj <- c_mowk[,c("moyr","wk_adj")]
c_wk2 <- apply_month_adjustment(c_wk, cw_adj)

ggplot(c_wk2, aes(x=date, y=search_adj))+geom_point()

#welfare
w_wk$moyr <- moyr_key(w_wk$date)
agg_w <- monthly_mean_of_weeks(w_wk)
w_mo$date <- as.Date(w_mo$date)
w_mo$moyr <- moyr_key(w_mo$date)
w_mowk <- month_adjustment(w_mo, agg_w)
ww_adj <- w_mowk[,c("moyr","wk_adj")]
w_wk2 <- apply_month_adjustment(w_wk, ww_adj)

ggplot(w_wk2, aes(x=date, y=search_adj))+geom_point()

#report
r_wk$moyr <- moyr_key(r_wk$date)
agg_r <- monthly_mean_of_weeks(r_wk)
r_mo$date <- as.Date(r_mo$date)
r_mo$moyr <- moyr_key(r_mo$date)
r_mowk <- month_adjustment(r_mo, agg_r)
rw_adj <- r_mowk[,c("moyr","wk_adj")]
r_wk2 <- apply_month_adjustment(r_wk, rw_adj)

ggplot(r_wk2, aes(x=date, y=search_adj))+geom_point()

#weather
z_wk$moyr <- moyr_key(z_wk$date)
agg_z <- monthly_mean_of_weeks(z_wk)
z_mo$date <- as.Date(z_mo$date)
z_mo$moyr <- moyr_key(z_mo$date)
z_mowk <- month_adjustment(z_mo, agg_z)
zw_adj <- z_mowk[,c("moyr","wk_adj")]
z_wk2 <- apply_month_adjustment(z_wk, zw_adj)

ggplot(z_wk2, aes(x=date, y=search_adj))+geom_point()

#### DAY TO WEEK ####
# Rescale each daily series against the month-adjusted weekly series of the
# same topic: for every week, dy_adj = adjusted weekly value / mean of the
# daily search values in that week, and search_adj = daily value * dy_adj.

# From here on the weekly tables are the month-adjusted ones.
c_wk <- c_wk2
w_wk <- w_wk2
r_wk <- r_wk2
z_wk <- z_wk2

rm(c_wk2,w_wk2,r_wk2,z_wk2,agg_c,agg_w,agg_r,agg_z,cw_adj,ww_adj,rw_adj,zw_adj)
rm(c_mowk,w_mowk,r_mowk,z_mowk)

# Week-and-year key built from lubridate's week() and year(); used to join
# daily and weekly rows.
wyr_key <- function(dates) {
  paste(week(dates), year(dates))
}

# Mean daily `search` value for every (week key, pull) combination.
# Returns a data frame with columns wyr, pull, search.
weekly_mean_of_days <- function(dy) {
  agg <- aggregate(dy$search, list(dy$wyr, dy$pull), mean)
  colnames(agg) <- c("wyr","pull","search")
  agg
}

# Join the weekly means of the daily series (search.x) to the adjusted weekly
# series and return one dy_adj ratio per week key (columns wyr, dy_adj).
day_adjustment <- function(agg, wk) {
  adj <- merge(agg, wk, by="wyr")
  adj <- adj[!duplicated(adj$date),] #remove dupes from pulls
  adj$dy_adj <- adj$search_adj/adj$search.x
  adj[,c("wyr","dy_adj")]
}

# Attach dy_adj to every daily row of the matching week, drop rows containing
# any NA and rows whose ratio is infinite, then compute search_adj.
apply_day_adjustment <- function(adj, dy) {
  dy2 <- merge(adj, dy, by="wyr")
  dy2 <- na.omit(dy2)
  dy2 <- dy2[!is.infinite(dy2$dy_adj),]
  dy2$search_adj <- dy2$search* dy2$dy_adj
  dy2
}

c_wk$wyr <- wyr_key(c_wk$date)
w_wk$wyr <- wyr_key(w_wk$date)
r_wk$wyr <- wyr_key(r_wk$date)
z_wk$wyr <- wyr_key(z_wk$date)

c_dy$wyr <- wyr_key(c_dy$date)
w_dy$wyr <- wyr_key(w_dy$date)
r_dy$wyr <- wyr_key(r_dy$date)
z_dy$wyr <- wyr_key(z_dy$date)

agg_c <- weekly_mean_of_days(c_dy)
agg_w <- weekly_mean_of_days(w_dy)
agg_r <- weekly_mean_of_days(r_dy)
agg_z <- weekly_mean_of_days(z_dy)

#crime
cdy2 <- day_adjustment(agg_c, c_wk)
c_dy2 <- apply_day_adjustment(cdy2, c_dy)

ggplot(c_dy2, aes(x=date,y=search_adj))+geom_point()+stat_smooth()+geom_vline(xintercept=as.Date("2017-01-20"))

#welfare
wdy2 <- day_adjustment(agg_w, w_wk)
w_dy2 <- apply_day_adjustment(wdy2, w_dy)

ggplot(w_dy2, aes(x=date,y=search_adj))+geom_point()+stat_smooth()+geom_vline(xintercept=as.Date("2017-01-20"))

#report
rdy2 <- day_adjustment(agg_r, r_wk)
r_dy2 <- apply_day_adjustment(rdy2, r_dy)

ggplot(r_dy2, aes(x=date,y=search_adj))+geom_point()+stat_smooth()+geom_vline(xintercept=as.Date("2017-01-20"))

#weather
zdy2 <- day_adjustment(agg_z, z_wk)
z_dy2 <- apply_day_adjustment(zdy2, z_dy)

ggplot(z_dy2, aes(x=date,y=search_adj))+geom_point()+stat_smooth()+geom_vline(xintercept=as.Date("2017-01-20"))

# Keep only the date, the raw value and the adjusted value.
c_dy2 <- c_dy2[,c("date","search","search_adj")]
w_dy2 <- w_dy2[,c("date","search","search_adj")]
r_dy2 <- r_dy2[,c("date","search","search_adj")]
z_dy2 <- z_dy2[,c("date","search","search_adj")]

# One row per date: keep the first occurrence of each.
c_dy2 <- c_dy2[!duplicated(c_dy2$date),]
w_dy2 <- w_dy2[!duplicated(w_dy2$date),]
r_dy2 <- r_dy2[!duplicated(r_dy2$date),]
z_dy2 <- z_dy2[!duplicated(z_dy2$date),]

# Switch to the project's output directory and write one CSV per topic,
# each holding that topic's date, raw and adjusted daily search values.
setwd(here("output"))

invisible(Map(
  function(series, path) write.csv(series, file=path),
  list(c_dy2, w_dy2, r_dy2, z_dy2),
  c("gt_crime_daily.csv",
    "gt_welfare_daily.csv",
    "gt_report_daily.csv",
    "gt_weather_daily.csv")
))